
#########################
#### content.R #########
#######################



# This file runs all text analyses, including topic models and word frequency analyses
# The file directly produces figure s4
# The words collected in figure 4 are drawn from the lists produced in the "frequency" section below


# set options and load packages ####
options(scipen=10)

library(pacman)
p_load(here, dplyr, ggplot2, stringr, vroom, readxl, antiword, purrr, 
       googlesheets4, magrittr, tidyverse, bit64, gridExtra, corrplot, fastDummies, 
       stringdist, here, tm, RTextTools, lubridate, qdapDictionaries, SnowballC, stm,
       discrim, naivebayes, kknn, ranger, fastDummies, textclean,
       tidymodels, textrecipes, janitor)
i_am("Code/06_content.R")
#


# jury-level speech: load and clean ####

# read in jury-level and juror-level data
# (the code below relies on these files providing `dat` and `juror`)
load(here("Data", "jury_with_numbers.RData"))
load(here("Data", "juror_level.RData"))

# use juror data to calculate jury attributes: one row of counts per jury.
# Every count drops missing values. Previously only the first six did, so a
# single juror with a missing dissenter flag or race could turn a jury's
# white/nonwhite dissenter counts into NA while its overall dissenter count
# stayed defined.
juror <- juror %>%
  group_by(jurynum) %>%
  summarise(
    n_dissenters_scale = sum(dissenter_scale, na.rm = TRUE),
    n_dissenters_doll = sum(dissenter_doll, na.rm = TRUE),
    n_higher_scale = sum(side_scale == "higher", na.rm = TRUE),
    n_higher_doll = sum(side_doll == "higher", na.rm = TRUE),
    n_lower_scale = sum(side_scale == "lower", na.rm = TRUE),
    n_lower_doll = sum(side_doll == "lower", na.rm = TRUE),
    n_white_scale = sum(dissenter_scale == TRUE & white == 1, na.rm = TRUE),
    n_nonwhite_scale = sum(dissenter_scale == TRUE & white == 0, na.rm = TRUE),
    n_white_doll = sum(dissenter_doll == TRUE & white == 1, na.rm = TRUE),
    n_nonwhite_doll = sum(dissenter_doll == TRUE & white == 0, na.rm = TRUE),
    .groups = "drop"
  )

# merge in jury attributes (natural join on the columns the two tables share)
dat <- left_join(dat, juror)

# recode/reshape to 1 round per row
# text1/text2 are stacked into a single `text` column; stripping the "text"
# prefix leaves `round` holding "1" or "2".
tab1 <- dat %>%
  select(text1, text2, jurynum) %>%
  pivot_longer(
    cols = c(text1, text2),
    names_prefix = "text",
    names_to = "round",
    values_to = "text"
  ) %>%
  filter(!is.na(text)) %>%
  # re-attach the jury covariates (natural join on the shared columns)
  left_join(
    dat %>%
      select(whites, doll_prefs, scale_prefs, sd_pre, order, scenario, jurynum,
             n_dissenters_scale, n_dissenters_doll,
             n_white_scale:n_nonwhite_doll) %>%
      distinct()
  ) %>%
  mutate(
    # which response format was deliberated in this round, given the order
    type = case_when(
      order == "A" & round == 1 ~ "Dollars",
      order == "B" & round == 2 ~ "Dollars",
      order == "A" & round == 2 ~ "Scale",
      order == "B" & round == 1 ~ "Scale"
    ),
    # three-level racial composition label
    race_kind = case_when(
      whites == 6 ~ "6 whites",
      whites == 5 ~ "5 whites",
      whites < 5 ~ "4 or fewer whites"
    )
  )

# calculate length
# NOTE(review): this counts space characters; the individual-level section
# counts runs of non-whitespace ("\\S+") instead -- confirm the difference is
# intended.
tab1 <- tab1 %>%
  mutate(length = str_count(text, pattern = " "))

# create composition variables
# 0/1 indicators (stored as factors) for 6, 5, and fewer than 5 white jurors;
# a missing `whites` value falls through to 0.
tab1 <- tab1 %>%
  mutate(
    whites6 = case_when(whites == 6 ~ 1, TRUE ~ 0),
    whites5 = case_when(whites == 5 ~ 1, TRUE ~ 0),
    whites4 = case_when(whites < 5 ~ 1, TRUE ~ 0)
  ) %>%
  mutate(across(whites6:whites4, as.factor))

# create other jury variables
tab1 <- tab1 %>%
  mutate(
    # 1 = Dollars round, 0 = Scale round
    type_f = as.factor(case_when(type == "Dollars" ~ 1,
                                 type == "Scale" ~ 0)),
    # "hi" when more than 3 jurors dissented on this round's measure
    diss_type = case_when(
      type == "Dollars" & n_dissenters_doll > 3 ~ "hi",
      type == "Dollars" & n_dissenters_doll <= 3 ~ "lo",
      type == "Scale" & n_dissenters_scale > 3 ~ "hi",
      type == "Scale" & n_dissenters_scale <= 3 ~ "lo"
    )
  ) %>%
  # drop rounds whose dissent level could not be classified
  filter(!is.na(diss_type)) %>%
  # one label per type x dissent level x round, then one dummy column per label
  mutate(group = paste(type, diss_type, round, sep = "_")) %>%
  fastDummies::dummy_cols(select_columns = "group")

# keep only the dissenter counts that match the measure used in each round,
# then drop the measure-specific source columns
tab1 <- tab1 %>%
  mutate(
    n_dissenters = case_when(type == "Dollars" ~ n_dissenters_doll,
                             type == "Scale" ~ n_dissenters_scale),
    n_white_dissenters = case_when(type == "Dollars" ~ n_white_doll,
                                   type == "Scale" ~ n_white_scale),
    n_nonwhite_dissenters = case_when(type == "Dollars" ~ n_nonwhite_doll,
                                      type == "Scale" ~ n_nonwhite_scale)
  ) %>%
  select(-n_dissenters_scale, -n_dissenters_doll,
         -contains("white_doll"), -contains("white_scale"))
  
# reload and recode juror-level data for more jury-level variables
load(here("Data", "juror_level.RData"))

# per jury: did at least one white / nonwhite juror sit on the "higher" side,
# separately for the scale and the dollar measure
juror <- juror %>%
  group_by(jurynum) %>%
  mutate(
    scale_white_higher = any(white == 1 & side_scale == "higher", na.rm = TRUE),
    scale_nonwhite_higher = any(white == 0 & side_scale == "higher", na.rm = TRUE),
    doll_white_higher = any(white == 1 & side_doll == "higher", na.rm = TRUE),
    doll_nonwhite_higher = any(white == 0 & side_doll == "higher", na.rm = TRUE)
  ) %>%
  select(jurynum, scale_white_higher:doll_nonwhite_higher) %>%
  distinct()

# attach the flags, keep the pair matching each round's measure, and drop the
# measure-specific columns
tab1 <- tab1 %>%
  left_join(juror) %>%
  mutate(
    white_higher = case_when(type == "Dollars" ~ doll_white_higher,
                             type == "Scale" ~ scale_white_higher),
    nonwhite_higher = case_when(type == "Dollars" ~ doll_nonwhite_higher,
                                type == "Scale" ~ scale_nonwhite_higher)
  ) %>%
  select(-c(doll_white_higher, scale_white_higher,
            doll_nonwhite_higher, scale_nonwhite_higher))

# individual-level speech: load and clean ####
# NOTE: this section reassigns `dat` and `tab1`, replacing the jury-level
# objects built in the jury-level section.

load(here("Data", "juror_level.RData"))
dat <- juror

# load less-processed speech data, keep the relevant variables, and attach
# them to the juror table (natural join on the shared columns)
speech <- read.csv(here("Data", "jurorspeechtrial.csv")) %>%
  select(case_id, jurortext, jurortext_rd1, jurortext_rd2)
dat <- left_join(dat, speech)

# recode for analysis: one row per juror per round
tab1 <- dat %>%
  select(jurortext_rd1, jurortext_rd2, case_id) %>%
  pivot_longer(
    cols = c(jurortext_rd1, jurortext_rd2),
    names_prefix = "jurortext_rd",
    names_to = "round",
    values_to = "text"
  ) %>%
  filter(!is.na(text)) %>%
  left_join(
    dat %>%
      select(case_id, white, idoll, iscale, order, scenario, jurynum,
             dissenter_scale, dissenter_doll, side_doll, side_scale) %>%
      distinct()
  ) %>%
  mutate(
    # which response format was deliberated in this round, given the order
    type = case_when(
      order == "A" & round == 1 ~ "Dollars",
      order == "B" & round == 2 ~ "Dollars",
      order == "A" & round == 2 ~ "Scale",
      order == "B" & round == 1 ~ "Scale"
    ),
    # 1 if the juror dissented on this round's measure, otherwise 0
    dissenter = case_when(
      type == "Dollars" & dissenter_doll == TRUE ~ 1,
      type == "Scale" & dissenter_scale == TRUE ~ 1,
      TRUE ~ 0
    ),
    # 1 if the juror was on the "higher" side of this round's measure, else 0
    diss_high = case_when(
      type == "Dollars" & side_doll == "higher" ~ 1,
      type == "Scale" & side_scale == "higher" ~ 1,
      TRUE ~ 0
    )
  )

# length = number of whitespace-delimited tokens
tab1 <- tab1 %>%
  mutate(length = str_count(text, "\\S+"))

# flag jurors whose Identifier contains "fore" (case-insensitive) and attach
# the flag to the round-level table
dat$fore <- str_detect(tolower(dat$Identifier), "fore")
tab1 <- tab1 %>%
  left_join(dat %>% select(case_id, fore))


########################
### TOPIC MODELS #######
########################
# topic models: juries by composition ####
# load jury-level data
# NOTE: the model below uses `race_kind`, which only the jury-level `tab1` has;
# the individual-level section also assigns `tab1`, so re-run the jury-level
# section immediately before this one.
tab_orig <- tab1

# preprocessing, applied in this order:
#   identity signifiers -> punctuation -> proper names -> lowercase ->
#   symbols -> stemming
tab1$text <- tab1$text %>%
  str_remove_all("Man: |Woman: |Foreman: |Forewoman: ") %>%
  str_remove_all("[//://.,?'_]|-") %>%
  replace_names() %>%
  tolower() %>%
  replace_symbol() %>%
  stemDocument()

# remove stopwords (whole-word matches only)
# NOTE(review): the list holds unstemmed forms but is matched against text
# that has already been stemmed -- confirm every entry still matches.
stops <- c("the", "that", "to", "it", "a", "an", "and", "of", "is", "have", "was", "in", "be", "inaudible", "unclear")
stops <- paste0("\\b", paste(stops, collapse = "\\b|\\b"), "\\b")
tab1$text <- str_remove_all(tab1$text, stops)

# remove additional punctuation
tab1$text <- str_remove_all(tab1$text, "[\\-$=_`~|]")

# transform documents for analysis
temp <- textProcessor(
  documents = tab1$text,
  metadata = tab1,
  ucp = TRUE,
  customstopwords = "envelope"
)
head(temp$vocab)

# drop terms below the lower threshold and keep the aligned docs/vocab/meta
out <- prepDocuments(temp$documents, temp$vocab, temp$meta, lower.thresh = 5)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta
head(vocab)

# run set of topic models
mod1 <- selectModel(
  docs, vocab,
  K = 30,
  prevalence = ~factor(scenario)+factor(round)+type+race_kind,
  data = meta,
  runs = 50,
  seed = 4886,
  emtol = 1e-04
)

# look at model descriptives
plotModels(mod1)
labelTopics(mod1$runout[[1]], n = 50) # repeat for more models by replacing "1"

# check for effects of covariates (again, repeat for others by replacing 1)
prep <- estimateEffect(
  c(1:30) ~ factor(scenario)+factor(round)+type+race_kind,
  mod1$runout[[1]],
  meta = meta,
  documents = docs,
  uncertainty = "Global"
)

# validate by checking for models related to scenarios and rounds
# scenario effects for topic 30
plot.estimateEffect(
  prep,
  covariate = "scenario",
  topics = 30,
  main = "Scenario Effects on Topic Likelihood",
  xlab = "Effect",
  printlegend = TRUE,
  labeltype = "custom",
  custom.labels = c(1, 13, 10, 7, 4, 5, 14, 11, 8, 2, 6, 3, 15, 12, 9),
  nsims = 200
)
# round 1 minus round 2, all topics
plot.estimateEffect(
  prep,
  covariate = "round",
  topics = 1:30,
  method = "difference",
  cov.value1 = 1,
  cov.value2 = 2,
  main = "Effects on Topic Likelihood: Round 1 vs. 2",
  xlab = "Effect",
  printlegend = TRUE,
  labeltype = "custom",
  custom.labels = 1:30,
  nsims = 200
)
# Dollars minus Scale, all topics
plot.estimateEffect(
  prep,
  covariate = "type",
  topics = 1:30,
  method = "difference",
  cov.value1 = "Dollars",
  cov.value2 = "Scale",
  main = "Effects on Topic Likelihood: Dollars vs. Scale",
  xlab = "Effect",
  printlegend = TRUE,
  labeltype = "custom",
  custom.labels = 1:30,
  nsims = 200
)

# now pull out all race differences across all model runs
# For every run retained in mod1$runout, re-estimate the covariate effects and
# keep, for each topic, the third column of the coefficient table (stored as
# t5 / t6) for the "5 whites" and "6 whites" composition terms.
# NOTE: this overwrites `tab1` with the resulting model x topic table.
#
# The runs are enumerated with seq_along() rather than a hard-coded 1:10, so a
# different number of retained runs neither errors nor is silently truncated.
# One data frame is built per run and bound once, which avoids growing `tab1`
# inside the loop and the all-NA placeholder row it used to start from.
tab1 <- bind_rows(lapply(seq_along(mod1$runout), function(j) {
  prep_j <- estimateEffect(c(1:30) ~ factor(scenario)+factor(round)+type+race_kind,
                           mod1$runout[[j]], meta = meta, uncertainty = "Global",
                           documents = docs)
  coef_tables <- summary(prep_j)$tables

  # look the coefficient up by name so a missing term yields NA, not an error
  pull_stat <- function(tbl, term) unname(tbl[, 3][term])

  data.frame(
    model = j,
    topic = seq_along(coef_tables),
    t5 = vapply(coef_tables, pull_stat, numeric(1), term = "race_kind5 whites"),
    t6 = vapply(coef_tables, pull_stat, numeric(1), term = "race_kind6 whites")
  )
}))

# Figure s3: T-statistics for differences in topic usage by group racial composition  ####

# Overlaid histograms of the per-topic statistics, one colour per composition
# term; rows without a model id are dropped first.
tab1 %>%
  filter(!is.na(model)) %>%
  pivot_longer(
    cols = c(t5, t6),
    names_to = "coefname",
    values_to = "tstat"
  ) %>%
  mutate(
    Composition = case_when(
      coefname == "t5" ~ "5 whites",
      coefname == "t6" ~ "6 whites"
    )
  ) %>%
  ggplot() +
  geom_histogram(
    aes(x = tstat, fill = Composition),
    position = "identity",
    alpha = .2,
    color = "black"
  ) +
  theme_bw() +
  labs(x = "T Statistic", y = "Count") +
  theme(text = element_text(size = 20))

# topic models: white and nonwhite jurors ####

# load individual-level data, then run the following:
# NOTE: the jury-level loop above leaves `tab1` holding a table of statistics,
# so re-run the individual-level section immediately before this one.
tab_orig <- tab1

# preprocessing
  # remove incorrectly parsed symbols, then repeat remaining steps as before:
  # punctuation -> proper names -> lowercase -> symbols -> stemming
tab1$text <- tab1$text %>%
  str_remove_all("\x92") %>%
  str_remove_all("[//://.,?'_]|-") %>%
  replace_names() %>%
  tolower() %>%
  replace_symbol() %>%
  stemDocument()

# remove stopwords (whole-word matches only)
# NOTE(review): the list holds unstemmed forms but is matched against text
# that has already been stemmed -- confirm every entry still matches.
stops <- c("the", "that", "to", "it", "a", "an", "and", "of", "is", "have", "was", "in", "be", "inaudible", "unclear")
stops <- paste0("\\b", paste(stops, collapse = "\\b|\\b"), "\\b")
tab1$text <- str_remove_all(tab1$text, stops)

# remove additional punctuation
tab1$text <- str_remove_all(tab1$text, "[\\-$=_`~|]")

# transform documents for analysis
temp <- textProcessor(
  documents = tab1$text,
  metadata = tab1,
  ucp = TRUE,
  customstopwords = "envelope"
)
head(temp$vocab)

# drop terms below the lower threshold and keep the aligned docs/vocab/meta
out <- prepDocuments(temp$documents, temp$vocab, temp$meta, lower.thresh = 5)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta
head(vocab)

# run set of models
mod1 <- selectModel(
  docs, vocab,
  K = 30,
  prevalence = ~factor(scenario)+factor(round)+type+white,
  data = meta,
  runs = 50,
  seed = 4886,
  emtol = 1e-04
)

# check models
plotModels(mod1)
labelTopics(mod1$runout[[10]], n = 50) # repeat for different numbers

# check effects of covariates
prep <- estimateEffect(
  c(1:30) ~ factor(scenario)+factor(round)+type+white,
  mod1$runout[[10]],
  meta = meta,
  documents = docs,
  uncertainty = "Global"
)

# validate: check for scenario and round-specific topics
# scenario effects for topic 8 (custom labels left disabled, as before)
plot.estimateEffect(
  prep,
  covariate = "scenario",
  topics = 8,
  main = "Scenario Effects on Topic Likelihood",
  xlab = "Effect",
  printlegend = TRUE,
  #labeltype="custom",
  #custom.labels=c(1,13,10,7,4,5,14,11,8,2,6,3,15,12,9),
  nsims = 200
)
# round 1 minus round 2, all topics
plot.estimateEffect(
  prep,
  covariate = "round",
  topics = 1:30,
  method = "difference",
  cov.value1 = 1,
  cov.value2 = 2,
  main = "Effects on Topic Likelihood: Round 1 vs. 2",
  xlab = "Effect",
  printlegend = TRUE,
  labeltype = "custom",
  custom.labels = 1:30,
  nsims = 200
)
# Dollars minus Scale, all topics
plot.estimateEffect(
  prep,
  covariate = "type",
  topics = 1:30,
  method = "difference",
  cov.value1 = "Dollars",
  cov.value2 = "Scale",
  main = "Effects on Topic Likelihood: Dollars vs. Scale",
  xlab = "Effect",
  printlegend = TRUE,
  labeltype = "custom",
  custom.labels = 1:30,
  nsims = 200
)
# look at race differences: white ("1") minus nonwhite ("0"), all topics
plot.estimateEffect(
  prep,
  covariate = "white",
  topics = 1:30,
  method = "difference",
  cov.value1 = "1",
  cov.value2 = "0",
  main = "White - Nonwhite Diff. by Topic",
  xlab = "Effect",
  printlegend = TRUE,
  labeltype = "custom",
  custom.labels = 1:30,
  nsims = 500
)

# investigate model with race differences---this may or may not appear on your run
# Take the document-topic proportions (theta) of run 10, give the columns
# clean names, and put them next to the document metadata.
smods <- mod1$runout[[10]]$theta %>%
  as.data.frame() %>%
  janitor::clean_names() %>%
  bind_cols(meta, .)

# estimate effects of race and foreperson status
# (outcome is the theta column named v2; standard errors clustered by jury)
p_load(estimatr)
lm_robust(
  v2~factor(scenario)+factor(round)+type+white+fore,
  data = smods,
  clusters = jurynum
)


##################
### FREQUENCY ####
##################

tab_orig <- tab1
# load juror-level data
# juror-level: create word use by race datasets ####
# NOTE: earlier sections stem and strip `tab1$text` in place, so re-run the
# individual-level section first to start from the raw speech.
p_load(tidytext)

# list common stopwords
stops <- c("the", "that", "to", "it", "a", "an", "and", "of", "is", "have", "was", "in", "be", "inaudible", "unclear", "garbled", "tape",
           "this", "on", "for", "were", "with", "or", "at", "are", "their", "had", "as", "oh", "okay", "like",
           "mm", "hmm", "huh",
           "isnt", "weve", "theyv", "into", "id", "theyr", "youv", "ill", "hes", "el", "am",
           #most common spoken words from https://ucrel.lancs.ac.uk/bncfreq/flists.html
           "i", "you", "we", "do", "they", "uh", "uhh", "um", "umm", "yeah", "what", "but", "know", "well", "so", "got", "not",
           "no", "he", "she", "there", "think", "yes", "just", "all", "can", "then", "get", "did", "or", "would", "them",
           "there", "go", "now", "your", "had", "about", "said", "see", "me", "very", "out", "my", "when", "mean", "right",
           "which", "from", "going", "say", "been", "because", "some", "could", "will", "how", "on", "really", "come", "by")

# replace digits with words
# Only stand-alone digits 0-8 are spelled out (applied in the order listed).
# NOTE(review): 9 is left as a digit -- presumably because the rating scale
# runs 0-8; confirm.
tab1$text <- str_replace_all(
  tab1$text,
  c("\\b1\\b" = "one",
    "\\b2\\b" = "two",
    "\\b3\\b" = "three",
    "\\b4\\b" = "four",
    "\\b5\\b" = "five",
    "\\b6\\b" = "six",
    "\\b7\\b" = "seven",
    "\\b8\\b" = "eight",
    "\\b0\\b" = "zero")
)

# repeat other cleaning steps
# The "<U...>" pattern is bounded with [^>]* so that each escape token is
# replaced on its own. The previous greedy "<U.*>" matched from the first
# "<U" to the LAST ">" in a document and so deleted all speech in between
# whenever a document contained more than one such token.
tab1$text <- tab1$text %>%
  str_remove_all("\x92") %>%
  str_replace_all("-", " ") %>%
  str_replace_all("<U[^>]*>", " ") %>%
  str_remove_all("'") %>%
  str_replace_all("[//://.,?_]", " ") %>%
  replace_names() %>%
  tolower() %>%
  replace_symbol()

#stem, but exclude majority/major from stemming
# (a temporary suffix shields "majority" from the stemmer)
tab1$text <- str_replace_all(tab1$text, "majority", "majorityabc")
tab1$text <- stemDocument(tab1$text)
tab1$text <- str_replace_all(tab1$text, "majorityabc", "majority")

#remove special characters
tab1$text <- str_remove_all(tab1$text, "[\\-$=_`~|]")

# remove garbled words/numbers
bad_words <- c("l", "c", "e", "b", "r", "s", "000", "200", "100", "10",
               "a", "t", "50", "20", "500", "o", "25", "y", "250", "30",
               "300", "15", "h", "d", "400", "n", "150", "ut", "m",
               "t", "40", "ll", "12", "ou", "g", "w")

# count words used by race of deliberator
# NOTE(review): this first `book_words` (with tf-idf) and `total_words` are not
# read again before `book_words` is rebuilt per juror further down.
book_words <- tab1 %>%
  group_by(white) %>%
  unnest_tokens(word, text) %>%
  count(white, word, sort = TRUE)

total_words <- book_words %>%
  group_by(white) %>%
  summarize(total = sum(n))

# calculate tf, idf, and tf_idf
book_words <- book_words %>%
  left_join(total_words) %>%
  bind_tf_idf(word, white, n)

# calculate modified idf: prop w/in groups:
  # what proportion of all words by this group are this word?
race_words <- tab1 %>%
  ungroup() %>%
  group_by(white) %>%
  unnest_tokens(word, text) %>%
  count(white, word, sort = TRUE) %>%
  group_by(white) %>%
  mutate(total = sum(n),
         tf = n / total)
  # how many people use this word?
book_words <- tab1 %>%
  group_by(case_id) %>%
  unnest_tokens(word, text) %>%
  count(case_id, word, sort = TRUE)
  # merge unique observations
  # `jurynum` only reaches this join when `tab1` is grouped by it (select()
  # then keeps the grouping column). any_of() drops it when present and does
  # nothing otherwise, where select(-jurynum) would stop with an error on an
  # ungrouped `tab1`.
book_words <- left_join(book_words, tab1 %>% select(case_id, white)) %>%
  distinct() %>%
  select(-any_of("jurynum"))
  # total people by race
race_totals <- book_words %>%
  select(case_id, white) %>%
  distinct() %>%
  group_by(white) %>%
  mutate(people_race = n()) %>%
  select(people_race, white) %>%
  distinct()
  # merge observations
book_words <- left_join(book_words, race_totals)
  # what proportion of people of each race use this word?
  # (`count` = number of juror rows per race x word)
race_use <- book_words %>%
  group_by(white, word) %>%
  mutate(count = n()) %>%
  select(word, white, people_race, count) %>%
  distinct() %>%
  mutate(prop = count / people_race)

# merge together
comb <- left_join(race_words, race_use %>% select(word, white, prop, people_race, count))
# remove garbled and nonsense words
bad_words <- c("l", "c", "e", "b", "r", "s", "000", "200", "100", "10",
               "a", "t", "50", "20", "500", "o", "25", "y", "250", "30",
               "300", "15", "h", "d", "400", "n", "150", "ut", "m",
               "t", "40", "ll", "12", "ou", "g", "w")
comb <- comb %>%
  filter(!(word %in% bad_words)) %>%
  rename("n_jurors" = count)

# scenario coverage of each word
# For each scenario 1-15, collect the words that one racial group used more
# than 3 times within that scenario (the two groups' lists are pooled).
sc_words <- lapply(1:15, function(scen_id) {
  tab1 %>%
    ungroup() %>%
    group_by(white) %>%
    filter(scenario == scen_id) %>%
    unnest_tokens(word, text) %>%
    count(white, word, sort = TRUE) %>%
    filter(n > 3) %>%
    ungroup() %>%
    select(word) %>%
    unlist()
})

# total scenarios appearing: in how many of the 15 lists does each word occur?
comb$sc_appears <- unlist(lapply(comb$word, function(w) {
  hits <- lapply(sc_words, function(scen_vocab) w %in% scen_vocab)
  sum(unlist(hits))
}))

# clean for presentation
# one table per group; `prop` is renamed to say whose share it is
comb1 <- comb %>%
  filter(white == 1) %>%
  select(word, tf, prop_white = prop, n, total, people_race, n_jurors, sc_appears) %>%
  ungroup()
comb0 <- comb %>%
  filter(white == 0) %>%
  select(word, tf, prop_nonwhite = prop, n, total, people_race, n_jurors, sc_appears) %>%
  ungroup()

# join by race: give each table the other group's share for the same word
comb1 <- comb1 %>%
  left_join(comb0 %>% select(word, prop_nonwhite))
comb0 <- comb0 %>%
  left_join(comb1 %>% select(word, prop_white))

# adjust by proportions
# Both scores are 0 unless the group's own share of users is strictly larger;
# then tf is weighted by the log ratio (idf_mod) or by the relative gap
# (idf_modb) between the two shares.
comb1 <- comb1 %>%
  mutate(
    idf_mod = case_when(
      prop_nonwhite >= prop_white ~ 0,
      prop_white > prop_nonwhite ~ tf * (log(prop_white / prop_nonwhite))
    ),
    idf_modb = case_when(
      prop_nonwhite >= prop_white ~ 0,
      prop_white > prop_nonwhite ~ tf * (prop_white - prop_nonwhite) / prop_white
    )
  )

comb0 <- comb0 %>%
  mutate(
    idf_mod = case_when(
      prop_white >= prop_nonwhite ~ 0,
      prop_nonwhite > prop_white ~ tf * (log(prop_nonwhite / prop_white))
    ),
    idf_modb = case_when(
      prop_white >= prop_nonwhite ~ 0,
      prop_nonwhite > prop_white ~ tf * (prop_nonwhite - prop_white) / prop_nonwhite
    )
  )
rm(book_words)




# CALCULATE PERCENTILE DIFFERENCES ####

# merge white and poc lists, filter less-used words, rename variables
# (.x columns come from the white table, .y from the nonwhite table)
# Kept: words used by more than 10 jurors in each group, not in `stops`, and
# with sc_appears of at least 3.
percs <- left_join(
  comb1 %>% select(word, n, tf, n_jurors, sc_appears),
  comb0 %>% select(word, n, tf, n_jurors),
  by = "word"
) %>%
  filter(
    n_jurors.x > 10 & n_jurors.y > 10,
    !(word %in% stops),
    sc_appears >= 3
  ) %>%
  rename(tf_white = tf.x, tf_nonwhite = tf.y)

# code differences in tf and calculate differences in percentiles
# (each word's percentile within its own group's tf distribution)
percs <- percs %>%
  mutate(
    tf_diff = tf_white - tf_nonwhite,
    p_white = ecdf(tf_white)(tf_white),
    p_nonwhite = ecdf(tf_nonwhite)(tf_nonwhite),
    p_diff = p_white - p_nonwhite
  )

# view most different words: ranked higher among white jurors
percs %>%
  filter(p_diff > .05) %>%
  arrange(desc(p_diff)) %>%
  select(word) %>%
  as.list()

# view most different words: ranked higher among nonwhite jurors
percs %>%
  filter(p_diff < (-.05)) %>%
  arrange(p_diff) %>%
  select(word) %>%
  as.list()

# store differences
percs_tf <- percs

# PERCENTILE: PROP APPEARING

# merge and calculate differences
# Same word filter as for the tf percentiles, but ranking words by the share
# of jurors in each group who used them.
percs <- left_join(
  comb1 %>% select(word, n, prop_white, n_jurors, sc_appears),
  comb0 %>% select(word, n, prop_nonwhite, n_jurors),
  by = "word"
) %>%
  filter(
    n_jurors.x > 10 & n_jurors.y > 10,
    !(word %in% stops),
    sc_appears >= 3
  ) %>%
  mutate(
    p_white = ecdf(prop_white)(prop_white),
    p_nonwhite = ecdf(prop_nonwhite)(prop_nonwhite),
    p_diff = p_white - p_nonwhite
  )

# print most different words: ranked higher among white jurors
percs %>%
  filter(p_diff > .05) %>%
  arrange(desc(p_diff)) %>%
  select(word) %>%
  as.list()

# print most different words: ranked higher among nonwhite jurors
percs %>%
  filter(p_diff < (-.05)) %>%
  arrange(p_diff) %>%
  select(word) %>%
  as.list()

# store differences
percs_pu <- percs

# combine

# give both tables the same descriptive names for the shared columns and a
# measure-specific suffix (_tf / _pu) for the percentile columns
percs_tf <- percs_tf %>%
  rename(
    uses_white = n.x,
    uses_poc = n.y,
    n_jurors_white = n_jurors.x,
    n_jurors_poc = n_jurors.y,
    p_white_tf = p_white,
    p_nonwhite_tf = p_nonwhite,
    p_diff_tf = p_diff
  )
percs_pu <- percs_pu %>%
  rename(
    uses_white = n.x,
    uses_poc = n.y,
    n_jurors_white = n_jurors.x,
    n_jurors_poc = n_jurors.y,
    p_white_pu = p_white,
    p_nonwhite_pu = p_nonwhite,
    p_diff_pu = p_diff
  )

# combine and store top words
# (natural join on the columns the two tables share)
# A word is "top" for a group when its percentile gap exceeds .05 in that
# group's favour; top_white / top_poc require this on both measures.
tops <- full_join(percs_tf, percs_pu) %>%
  mutate(
    top_white_tf = p_diff_tf > .05,
    top_poc_tf = p_diff_tf < (-.05),
    top_white_pu = p_diff_pu > .05,
    top_poc_pu = p_diff_pu < (-.05),
    top_white = (top_white_pu + top_white_tf) == 2,
    top_poc = (top_poc_pu + top_poc_tf) == 2
  )

# view most different words
tops %>%
  filter(top_white == TRUE) %>%
  arrange(desc(p_diff_pu)) %>%
  select(word) %>%
  as.list()
tops %>%
  filter(top_poc == TRUE) %>%
  arrange(p_diff_pu) %>%
  select(word) %>%
  as.list()
# save results
write.csv(tops, here("Data", "word_differences.csv"))

# view overall most-different words ####

# top proportion diff: white
# 50 largest gaps in share of users; tf and idf_mod are shown x100 and the
# numeric columns tf through idf_mod are rounded to 2 decimals
comb1 %>%
  select(word, tf, prop_white, prop_nonwhite, idf_mod) %>%
  mutate(prop_diff = prop_white - prop_nonwhite) %>%
  filter(!(word %in% "s"), !(word %in% stops)) %>%
  slice_max(prop_diff, n = 50) %>%
  mutate(across(c(tf, idf_mod), function(v) v * 100)) %>%
  mutate(across(tf:idf_mod, function(v) round(v, 2))) %>%
  View()

#top proportion diff: nonwhite
comb0 %>%
  select(word, tf, prop_white, prop_nonwhite, idf_mod) %>%
  mutate(prop_diff = prop_nonwhite - prop_white) %>%
  filter(!(word %in% "s"), !(word %in% stops)) %>%
  slice_max(prop_diff, n = 50) %>%
  mutate(across(c(tf, idf_mod), function(v) v * 100)) %>%
  mutate(across(tf:idf_mod, function(v) round(v, 2))) %>%
  View()

#top TF diff:
# tf_diff is white tf minus nonwhite tf; slice_min() shows the 50 most
# negative gaps
comb1 %>%
  left_join(comb0 %>% select(word, tf), by = "word") %>%
  mutate(tf_diff = tf.x - tf.y) %>%
  filter(!(word %in% stops)) %>%
  slice_min(tf_diff, n = 50) %>%
  View()


# FIND DISCRIMINATING WORDS ####

# store common stopwords
# `stops` is passed to step_stopwords() in the recipe below as the custom
# stopword source. The vector contains a few repeated entries (e.g. "or",
# "there", "had", "on").
stops <- c("the", "that", "to", "it", "a", "an", "and", "of", "is", "have", "was", "in", "be", "inaudible", "unclear", "garbled", "tape",
           "this", "on", "for", "were", "with", "or", "at", "are", "their", "had", "as", "oh", "okay", "like",
           "mm", "hmm", "huh", 
           "isnt", "weve", "theyv", "into", "id", "theyr", "youv", "ill", "hes", "el", "am", 
           #most common spoken words from https://ucrel.lancs.ac.uk/bncfreq/flists.html
           "i", "you", "we", "do", "they", "uh", "uhh", "um", "umm", "yeah", "what", "but", "know", "well", "so", "got", "not",
           "no", "he", "she", "there", "think", "yes", "just", "all", "can", "then", "get", "did", "or", "would", "them",
           "there", "go", "now", "your", "had", "about", "said", "see", "me", "very", "out", "my", "when", "mean", "right",
           "which", "from", "going", "say", "been", "because", "some", "could", "will", "how", "on", "really", "come", "by")

# store words to keep
# `keeps` is the whitelist used by step_tokenfilter() in the recipe below:
# only tokens found in this vector are retained as predictors.
# NOTE(review): entries look like stemmed forms (e.g. "compani", "becaus"),
# presumably matching the stemDocument() output -- confirm.
keeps <- c("if", "dont", "one", "compani", "becaus", "dollar", "should", "put", "million", "five", "two", "make", "up", "didnt", "four", "want", "more", "thing", "punish", "peopl", "sever", "three", "someth", "other", "money", "damag", "her", "take", "gonna", "here", "agre", "time", "down", "much", "whi", "punit", "six", "let", "need", "realli", "amount", "happen", "too", "year", "way", "substanti", "ani", "even", "still", "mayb", "has", "number", "look", "onli", "where", "again", "littl", "kind", "probabl", 
  "pay", "lot", "than", "feel", "give", "respons", "zero", "thought", "cant", "percent", "point", "person", "back", "anyth", "these", "problem", "product", "enough", "hundr", "also", "his", "case", "doe", "guy", "profit", "doesnt", "who", "went", "work", "sure", "eight", "seven", "fact", "somebodi", "good", "tri", "juri", "our", "us", "caus", "knew", "chang", "use", "figur", "law", "talk", "alreadi", "same", "between", "though", "compensatori", "reason", "those", "award", "first", "part", "veri", "might", 
  "hurt", "wasnt", "wouldnt", "done", "mild", "around", "differ", "decid", "life", "whatev", "extrem", "els", "big", "noth", "test", "everybodi", "off", "over", "standard", "half", "him", "made", "thousand", "start", "care", "minut", "shes", "disregard", "bit", "high", "anoth", "tell", "may", "whether", "exact", "suppos", "actual", "far", "consid", "fine", "befor", "guess", "after", "warn", "busi", "medic", "car", "inform", "mani", "decis", "child", "never", "neglig", "believ", "possibl", "stop", "through", 
  "least", "question", "anybodi", "cost", "malici", "level", "safeti", "wrong", "side", "felt", "read", "set", "fault", "defend", "hit", "whole", "either", "higher", "own", "insur", "someon", "better", "everyth", "futur", "reckless", "compens", "pretti", "base", "basic", "goe", "keep", "understand", "opinion", "next", "low", "bad", "man", "messag", "wont", "ive", "effect", "end", "awar", "everi", "live", "stuff", "mind", "theyll", "true", "kid", "action", "hand", "job", "send", "line", "none", "within", 
  "accid", "assum", "away", "lawyer", "word", "long", "most", "shouldnt", "cover", "middl", "obvious", "certain", "call", "real", "risk", "correct", "buy", "close", "court", "ladi", "abl", "employe", "matter", "paid", "parent", "slap", "rest", "fix", "limit", "show", "place", "ten", "deter", "open", "came", "gave", "safe", "manufactur", "issu", "total", "type", "alway", "under", "attent", "less", "public", "individu", "seem", "sorri", "bill", "idea", "kill", "sit", "took", "attorney", "ever", "gone", 
  "second", "given", "hear", "hey", "mine", "suffer", "sound", "along", "larg", "told", "pick", "ahead", "anyway", "find", "sue", "chose", "day", "run", "couldnt", "fair", "quit", "stay", "act", "chanc", "concern", "hold", "onc", "wait", "hire", "involv", "nobodi", "lost", "consum", "leav", "purpos", "each", "sell", "hour", "yet", "spend", "unless", "watch", "brought", "serious", "continu", "order", "check", "drive", "sometim", "across", "die", "depend", "price", "realiz", "proper", "receiv")

# load individual-level data, then:
# NOTE: earlier sections stem and strip `tab1$text` in place, so re-run the
# individual-level section first to start from the raw speech.
tab_orig <- tab1
tab1$white <- as.factor(tab1$white)
tab1 <- tab1 %>% filter(text != "")

# process text: replace numbers, symbols, names, punctuation
# Only stand-alone digits 0-8 are spelled out (applied in the order listed).
# NOTE(review): 9 is left as a digit -- presumably because the rating scale
# runs 0-8; confirm.
tab1$text <- str_replace_all(
  tab1$text,
  c("\\b1\\b" = "one",
    "\\b2\\b" = "two",
    "\\b3\\b" = "three",
    "\\b4\\b" = "four",
    "\\b5\\b" = "five",
    "\\b6\\b" = "six",
    "\\b7\\b" = "seven",
    "\\b8\\b" = "eight",
    "\\b0\\b" = "zero")
)

# The "<U...>" pattern is bounded with [^>]* so that each escape token is
# replaced on its own. The previous greedy "<U.*>" matched from the first
# "<U" to the LAST ">" in a document and so deleted all speech in between
# whenever a document contained more than one such token.
tab1$text <- tab1$text %>%
  str_remove_all("\x92") %>%
  str_replace_all("-", " ") %>%
  str_replace_all("<U[^>]*>", " ") %>%
  str_remove_all("'") %>%
  str_replace_all("[//://.,?_]", " ") %>%
  replace_names() %>%
  tolower() %>%
  replace_symbol()

#stem, but exclude majority/major
# (a temporary suffix shields "majority" from the stemmer)
tab1$text <- str_replace_all(tab1$text, "majority", "majorityabc")
tab1$text <- stemDocument(tab1$text)
tab1$text <- str_replace_all(tab1$text, "majorityabc", "majority")

#remove special characters
tab1$text <- str_remove_all(tab1$text, "[\\-$=_`~|]")

# run model predicting juror race from word frequency
# Recipe: drop empty texts, tokenize into words, remove `stops`, keep only
# tokens listed in `keeps`, and turn the rest into term-frequency columns.
word_dat <- recipe(white ~ text, data = tab1) %>%
  step_filter(text != "") %>%
  step_tokenize(text, token = "words") %>%
  step_stopwords(text, custom_stopword_source = stops) %>%
  step_tokenfilter(text, filter_fun = function(tok) tok %in% keeps) %>%
  step_tf(text) %>%
  prep(training = tab1)
word_prepped <- juice(word_dat)

# logistic regression through glmnet with mixture = 0 and penalty = 0.1
glmnet_model <- set_engine(
  logistic_reg(mixture = 0, penalty = 0.1),
  "glmnet"
)
word_model <- fit(glmnet_model, white ~ ., data = word_prepped)

# collect most-predictive words
# Strip the "tf_text_" prefix so terms read as plain words, drop the
# intercept, and keep the coefficients reported at step 100.
# NOTE(review): step 100 is a position on the fitted path, which may not be
# the penalty = 0.1 solution requested above -- confirm this is intended.
diffs <- word_model$fit %>%
  tidy() %>%
  mutate(term = str_remove(term, "tf_text_")) %>%
  filter(term != "(Intercept)", step == 100)

# OVERALL WORD FREQUENCY RESULTS ####

# read in percentile differences
perc_diffs <- clean_names(read.csv(here("Data", "word_differences.csv")))

# join all differences together
# one row per word: percentile gap (share-of-users measure) next to the
# model coefficient for the same word
all_diffs <- perc_diffs %>%
  select(word, p_diff_pu) %>%
  left_join(diffs %>% select(word = term, estimate)) %>%
  rename(Percentile = p_diff_pu, Prediction = estimate)

# check relationships between measures
plot(all_diffs$Percentile, all_diffs$Prediction)
cor(all_diffs$Percentile, all_diffs$Prediction, use = "pairwise")
summary(lm(Percentile ~ Prediction, data = all_diffs))

# clean all differences: keep only words that have a model coefficient
all_diffs <- all_diffs[!is.na(all_diffs$Prediction), ]

# list top words in each category
# *_perc: flagged as a top word in the saved percentile table
# *_pred: among the 50 largest (white) or 50 smallest (poc) coefficients
pred_desc <- sort(all_diffs$Prediction, decreasing = TRUE)
pred_asc <- sort(all_diffs$Prediction)
all_diffs$top_white_perc <- all_diffs$word %in% perc_diffs$word[perc_diffs$top_white == TRUE]
all_diffs$top_poc_perc <- all_diffs$word %in% perc_diffs$word[perc_diffs$top_poc == TRUE]
all_diffs$top_white_pred <- all_diffs$Prediction >= pred_desc[50]
all_diffs$top_poc_pred <- all_diffs$Prediction <= pred_asc[50]

# view top words in each category
# white: both measures / percentile only / prediction only
all_diffs$word[all_diffs$top_white_perc == TRUE & all_diffs$top_white_pred == TRUE]
all_diffs$word[all_diffs$top_white_perc == TRUE & all_diffs$top_white_pred == FALSE]
all_diffs$word[all_diffs$top_white_perc == FALSE & all_diffs$top_white_pred == TRUE]

# poc: both measures / percentile only / prediction only
all_diffs$word[all_diffs$top_poc_perc == TRUE & all_diffs$top_poc_pred == TRUE]
all_diffs$word[all_diffs$top_poc_perc == TRUE & all_diffs$top_poc_pred == FALSE]
all_diffs$word[all_diffs$top_poc_perc == FALSE & all_diffs$top_poc_pred == TRUE]
